#pragma  once
/**
 * @file mtgp32-fast.h
 *
 * @brief Mersenne Twister for Graphic Processors (mtgp32), which
 * generates 32-bit unsigned integers and single precision floating
 * point numbers based on IEEE 754 format.
 *
 * @author Mutsuo Saito (Hiroshima University)
 * @author Makoto Matsumoto (Hiroshima University)
 *
 * Copyright (C) 2009 Mutsuo Saito, Makoto Matsumoto and
 * Hiroshima University. All rights reserved.
 *
 * The new BSD License is applied to this software, see LICENSE.txt
 */
#include <string.h>
#include <assert.h>
#include <stdio.h>
#include <stdint.h>

#include <amp.h>

/**
 * \struct MTGP32_PARAMS_FAST_T
 * MTGP32 parameters.
 * Some element is redundant to keep structure simple.
 *
 * \b pos is a pick up position which is selected to have good
 * performance on graphic processors.  3 < \b pos < Q, where Q is a
 * maximum number such that the size of status array - Q is a power of
 * 2.  For example, when \b mexp is 44497, size of 32-bit status array
 * is 696, and Q is 184, then \b pos is between 4 and 183. This means
 * 512 parallel calculations is allowed when \b mexp is 44497.
 *
 * \b poly_sha1 is SHA1 digest of the characteristic polynomial of
 * state transition function. SHA1 is calculated based on printing
 * form of the polynomial. This is important when we use parameters
 * generated by the dynamic creator which
 *
 * \b mask This is a mask to make the dimension of state space have
 * just Mersenne Prime. This is redundant.
 */
typedef struct MTGP32_PARAMS_FAST_T {
    int mexp;                        /**< Mersenne exponent. This is redundant. */
    int pos;                        /**< pick up position. */
    int sh1;                        /**< shift value 1. 0 < sh1 < 32. */
    int sh2;                        /**< shift value 2. 0 < sh2 < 32. */
    uint32_t tbl[16];                /**< a small matrix. */
    uint32_t tmp_tbl[16];        /**< a small matrix for tempering. */
    uint32_t flt_tmp_tbl[16];        /**< a small matrix for tempering and
                                 converting to float. */
    uint32_t mask;                /**< This is a mask for state space */
    /*terry: unsigned char*/unsigned int poly_sha1[21]; /**< SHA1 digest */
} mtgp32_params_fast_t;

/**
 * \struct MTGP32_STATUS_FAST_T
 * MTGP32 internal state array.
 * In this structure, we use \b larger_size and \b larger_mask to
 * avoid slow operation of remainder (%).
 *
 * \b size is the minimum needed size to represent linear space of
 * two element filed as 32-bit array.
 *
 * \b large_size is a minimum power of 2 which is larger than \b size.
 * \b array has this size.
 *
 * \b large_mask is a bit mask to update \b idx in a fast way.
 * \b idx is updated by `logical and' \b & with large_mask.
 */
typedef struct MTGP32_STATUS_FAST_T {
    int idx;                        /**< index */
    int size;                        /**< minimum needed size */
    int large_size;                /**< real size of array */
    int large_mask;                /**< bit mask to update \b idx */
    uint32_t array[2048];                /**< internal state array, size = 44497/32+1 as in mtgp32_init_state() */
} mtgp32_status_fast_t;

/**
 * \struct MTGP32_STATUS_FAST_T
 * MTGP32 interface structure
 */
typedef struct MTGP32_FAST_T {
    mtgp32_params_fast_t params; /**< parameters */
    mtgp32_status_fast_t *status; /**< internal state */
} mtgp32_fast_t;

/**
 * parameter constants tables for MEXP=11213.
 * \b size - \b pos > 256.
 */
extern mtgp32_params_fast_t mtgp32_params_fast_11213[128];
/**
 * parameter constants tables for MEXP=23209.
 * \b size - \b pos > 512.
 */
extern mtgp32_params_fast_t mtgp32_params_fast_23209[128];
/**
 * parameter constants tables for MEXP=44497.
 * \b size - \b pos > 1024.
 */
extern mtgp32_params_fast_t mtgp32_params_fast_44497[128];

int mtgp32_init(mtgp32_fast_t *mtgp32,
                const mtgp32_params_fast_t *para, uint32_t seed);
void get_sizes(const mtgp32_params_fast_t *para, int& size, int& large_size);
void mtgp32_init_state(uint32_t array[],
                      const mtgp32_params_fast_t *para, uint32_t seed)
                      restrict(cpu, amp);
int mtgp32_init_by_array(mtgp32_fast_t *mtgp32,
                         const mtgp32_params_fast_t *para,
                        uint32_t *array, int length);
int mtgp32_init_by_str(mtgp32_fast_t *mtgp32,
                       const mtgp32_params_fast_t *para,
                       char *str);
void mtgp32_free(mtgp32_fast_t *mtgp32);
void mtgp32_print_idstring(const mtgp32_fast_t *mtgp32, FILE *fp);

inline static void mtgp32_do_recursion(uint32_t *r, uint32_t x1,
                                       uint32_t x2, uint32_t y,
                                       int sh1, int sh2,
                                       uint32_t mask, uint32_t tbl[16])
                                       restrict(cpu, amp);
inline static void mtgp32_next_state(
    mtgp32_params_fast_t& params,
    mtgp32_status_fast_t *status
) restrict(cpu, amp);
//inline static uint32_t mtgp32_genrand_uint32(mtgp32_fast_t *mtgp32);
//inline static float mtgp32_genrand_close1_open2(mtgp32_fast_t *mtgp32);
//inline static float mtgp32_genrand_close_open(mtgp32_fast_t *mtgp32);
//inline static float mtgp32_genrand_open_close(mtgp32_fast_t *mtgp32);
inline static float mtgp32_genrand_open_open(
    const mtgp32_params_fast_t& params,
    mtgp32_status_fast_t *status
) restrict(cpu, amp);


/**
 * This function initializes the internal state array with a 32-bit
 * integer seed. The allocated memory should be freed by calling
 * mtgp32_free(). \b para should be one of the elements in the
 * parameter table (mtgp32-param-ref.c).
 *
 * This function is call by cuda program, because cuda program uses
 * another structure and another allocation method.
 *
 * @param[out] array MTGP internal status vector.
 * @param[in] para parameter structure
 * @param[in] seed a 32-bit integer used as the seed.
 */
inline void mtgp32_init_state(
    uint32_t array[],
    const mtgp32_params_fast_t *para, 
    uint32_t seed) 
    restrict(cpu, amp)
{
    int i;
    int size = para->mexp / 32 + 1;
    uint32_t hidden_seed;
    uint32_t tmp;
    hidden_seed = para->tbl[4] ^ (para->tbl[8] << 16);
    tmp = hidden_seed;
    tmp += tmp >> 16;
    tmp += tmp >> 8;
    //memset(array, tmp & 0xff, sizeof(uint32_t) * size);
    uint32_t tmp2 = tmp & 0xff;
    for(int j=0; j<size; ++j) array[j] = (tmp2 << 24) | (tmp2 << 16) | (tmp2 << 8) | tmp2;
    array[0] = seed;
    array[1] = hidden_seed;
    for (i = 1; i < size; i++) {
        array[i] ^= UINT32_C(1812433253) * (array[i - 1] ^ (array[i - 1] >> 30)) + i;
    }
}

/*
 * PRIVATE INLINE FUNCTIONS
 */
/**
 * This is a recursion formula of the generator.
 * MTGP32 is a 32-bit generator, but using 32-bit operations to fit to
 * graphic processors.

 * @param[out] r output
 * @param[in] x1 the farthest part of state array.
 * @param[in] x2 the second farthest part of state array.
 * @param[in] y a part of state array.
 * @param[in] sh1 the shift parameter 1.
 * @param[in] sh2 the shift parameter 2.
 * @param[in] mask the bit mask parameter.
 * @param[in] tbl the matrix parameter.
 */
inline static void mtgp32_do_recursion(uint32_t *r, uint32_t x1,
                                       uint32_t x2, uint32_t y,
                                       int sh1, int sh2,
                                       uint32_t mask, const uint32_t tbl[16]) 
                                       restrict(cpu, amp)

{
    uint32_t x;

    x = (x1 & mask) ^ x2;
    x ^= x << sh1;
    y = x ^ (y >> sh2);
    *r = y ^ tbl[y & 0x0f];
}

/**
 * The state transition function.
 * @param[in,out] mtgp32 the all in one structure
 */
inline static void mtgp32_next_state(    
    const mtgp32_params_fast_t& params,
    mtgp32_status_fast_t *status
) restrict(amp)
 {
    uint32_t *array = status->array;
    int idx = status->idx;
    int pos = params.pos;
    int large_size = status->large_size;
    uint32_t large_mask = status->large_mask;
    int size = status->size;

    status->idx = (status->idx + 1) & large_mask;
    //Concurrency::direct3d_printf("idx %d %d", idx, status->idx);
    idx = status->idx;
    mtgp32_do_recursion(&(array[idx]),
                        array[(idx - size + large_size) & large_mask],
                        array[(idx - size + large_size + 1) & large_mask],
                        array[(idx + pos - size + large_size) & large_mask],
                        params.sh1,
                        params.sh2,
                        params.mask,
                        params.tbl);
}
inline static void mtgp32_next_state(    
    const mtgp32_params_fast_t& params,
    mtgp32_status_fast_t *status
) restrict(cpu)
 {
    uint32_t *array = status->array;
    int idx = status->idx;
    int pos = params.pos;
    int large_size = status->large_size;
    uint32_t large_mask = status->large_mask;
    int size = status->size;

    status->idx = (status->idx + 1) & large_mask;
    idx = status->idx;
    mtgp32_do_recursion(&(array[idx]),
                        array[(idx - size + large_size) & large_mask],
                        array[(idx - size + large_size + 1) & large_mask],
                        array[(idx + pos - size + large_size) & large_mask],
                        params.sh1,
                        params.sh2,
                        params.mask,
                        params.tbl);
}

/**
 * The tempering function.
 * @param[in] tmp_tbl the pre-computed tempering table.
 * @param[in] r the value to be tempered.
 * @param[in] t the tempering helper value.
 * @return the tempered value.
 */
inline static uint32_t mtgp32_temper(const uint32_t tmp_tbl[16],
                                     uint32_t r, uint32_t t) {
    t ^= t >> 16;
    t ^= t >> 8;
    r ^= tmp_tbl[t & 0x0f];
    return r;
}

/**
 * The tempering and converting function.
 * @param[in] flt_tmp_tbl the pre-computed tempering table.
 * @param[in] r the value to be tempered.
 * @param[in] t the tempering helper value.
 * @return the tempered value.
 */
inline static float mtgp32_temper_float(const uint32_t flt_tmp_tbl[16],
                                          uint32_t r, uint32_t t) {
    union {
        uint32_t u;
        float f;
    } x;
    t ^= t >> 16;
    t ^= t >> 8;
    r = r >> 9;
    x.u = r ^ flt_tmp_tbl[t & 0x0f];
    return x.f;
}

/**
 * The tempering and converting function for generating floating point
 * number \b f (0 < \b f < 1).
 * @param[in] flt_tmp_tbl the pre-computed tempering table.
 * @param[in] r the value to be tempered.
 * @param[in] t the tempering helper value.
 * @return the tempered value.
 */
inline static float mtgp32_temper_float_open(
    const uint32_t flt_tmp_tbl[16],
    uint32_t r, 
    uint32_t t) 
    restrict(cpu, amp)
{
    union {
        uint32_t u;
        float f;
    } x;
    t ^= t >> 16;
    t ^= t >> 8;
    r = r >> 9;
    x.u = (r ^ flt_tmp_tbl[t & 0x0f]) | 1;
    return x.f;
}

///*
// * PUBLIC INLINE FUNCTIONS
// */
///**
// * This function generates and returns 32-bit unsigned integer.
// * mtgp32_init(), mtgp32_init_by_array() or mtgp32_init_by_str() must
// * be called before this function.
// *
// * @param[in,out] mtgp32 MTGP all in one structure.
// * @return 32-bit unsigned integer.
// */
//inline static uint32_t mtgp32_genrand_uint32(mtgp32_fast_t *mtgp32) {
//    int idx;
//    uint32_t *tmp_tbl = mtgp32->params.tmp_tbl;
//    uint32_t *array = mtgp32->status->array;
//    int pos = mtgp32->params.pos;
//    int large_size = mtgp32->status->large_size;
//    int size = mtgp32->status->size;
//    uint32_t large_mask = mtgp32->status->large_mask;
//
//    mtgp32_next_state(mtgp32);
//    idx = mtgp32->status->idx;
//    return mtgp32_temper(tmp_tbl,
//                         array[idx],
//                         array[(idx + pos - 1 - size + large_size)
//                               & large_mask]);
//}
//
///**
// * This function generates and returns single precision pseudorandom
// * number which distributes uniformly in the range [1, 2).
// * mtgp32_init(), mtgp32_init_by_array() or mtgp32_init_by_str() must
// * be called before this function.
// *
// * @param[in,out] mtgp32 MTGP all in one structure.
// * @return single precision floating point pseudorandom number
// */
//inline static float mtgp32_genrand_close1_open2(mtgp32_fast_t *mtgp32) {
//    int idx;
//    uint32_t *flt_tmp_tbl = mtgp32->params.flt_tmp_tbl;
//    uint32_t *array = mtgp32->status->array;
//    int pos = mtgp32->params.pos;
//    int large_size = mtgp32->status->large_size;
//    int size = mtgp32->status->size;
//    uint32_t large_mask = mtgp32->status->large_mask;
//
//    mtgp32_next_state(mtgp32);
//    idx = mtgp32->status->idx;
//    return mtgp32_temper_float(flt_tmp_tbl,
//                                array[idx],
//                                array[(idx + pos - 1 - size + large_size)
//                                      & large_mask]);
//}
//
///**
// * This function generates and returns single precision pseudorandom
// * number which distributes uniformly in the range [0, 1).
// * mtgp32_init(), mtgp32_init_by_array() or mtgp32_init_by_str() must
// * be called before this function.
// *
// * @param[in,out] mtgp32 MTGP all in one structure.
// * @return single precision floating point pseudorandom number
// */
//inline static float mtgp32_genrand_close_open(mtgp32_fast_t *mtgp32) {
//    return mtgp32_genrand_close1_open2(mtgp32) - 1.0F;
//}
//
///**
// * This function generates and returns single precision pseudorandom
// * number which distributes uniformly in the range (0, 1].
// * mtgp32_init(), mtgp32_init_by_array() or mtgp32_init_by_str() must
// * be called before this function.
// *
// * @param[in,out] mtgp32 MTGP all in one structure.
// * @return single precision floating point pseudorandom number
// */
//inline static float mtgp32_genrand_open_close(mtgp32_fast_t *mtgp32) {
//    return 2.0F - mtgp32_genrand_close1_open2(mtgp32);
//}

/**
 * This function generates and returns single precision pseudorandom
 * number which distributes uniformly in the range (0, 1).
 * mtgp32_init(), mtgp32_init_by_array() or mtgp32_init_by_str() must
 * be called before this function.
 *
 * @param[in,out] mtgp32 MTGP all in one structure.
 * @return single precision floating point pseudorandom number
 */
inline static float mtgp32_genrand_open_open(
    //mtgp32_fast_t *mtgp32
    const mtgp32_params_fast_t& params,
    mtgp32_status_fast_t *status
    ) 
    restrict(cpu, amp)
{
    float r;
    int idx;
    const uint32_t *flt_tmp_tbl = params.flt_tmp_tbl;
    uint32_t *array = status->array;
    int pos = params.pos;
    int large_size = status->large_size;
    int size = status->size;
    uint32_t large_mask = status->large_mask;
    mtgp32_next_state(params, status);
    idx = status->idx;
    r = mtgp32_temper_float_open(
        flt_tmp_tbl,
        array[idx],
        array[(idx + pos - 1 - size + large_size) & large_mask]);
    return r - 1.0F;
}
